Build a classification model with Decision Trees. The main objective is to use the two splitting criteria of Gini index and Gain ratio and observe the performance of the decision tree on the given data set. It is a real dataset about students' knowledge status of the subject of Electrical DC Machines.
# Convert jupyter notebook into full screen
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
# Reset all variables and objects in notebook
%reset -f
from os import chdir, getcwd
wd = getcwd()
wd
Import pandas, seaborn, and the other usual libraries.
#!pip install chefboost
%%html
<style>
img {align:left}
</style>
from chefboost import Chefboost as chef
# for loading dataset
from dataprep.datasets import load_dataset
# importing function from DataPrep.eda
from dataprep.eda import create_report
import dalex as dx
from datetime import datetime
from genetic_selection import GeneticSelectionCV
import graphviz
#Import data from your laptop's local folder
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
import io
from IPython.display import display
import itertools
import joblib
# os.sys.modules['sklearn.externals.joblib'] = joblib
from tensorflow import keras
from keras_tuner import RandomSearch
from keras.models import Sequential
from keras.layers import Dense,Dropout
from lightgbm import LGBMRegressor
import matplotlib.dates
import matplotlib.pyplot as plt
%matplotlib inline
import missingno as msno
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from mlxtend.plotting import plot_confusion_matrix
import numpy as np
from numpy.random import randn
from numpy.random import seed
from operator import sub
import os
import pandas as pd
import pandas_profiling
plt.rcParams["font.family"] = 'DejaVu Sans'
import re
from rfpimp import permutation_importances
from scipy import stats
from scipy.stats import boxcox
from scipy.stats import pearsonr
from scipy.stats import skew
import seaborn as sns
import sklearn
from sklearn import metrics
from sklearn import preprocessing #for scaling and pre-processing data
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble._forest import ForestClassifier, ForestRegressor
from sklearn.experimental import enable_iterative_imputer #enable th experimental feature of interative imputer
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.impute import SimpleImputer # used for handling missing data
from sklearn.impute import IterativeImputer #the IterativeImputer in scikit-learn (view documentation) utilizes the data
#available in other features in order to estimate the missing values being
#imputed
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.manifold import TSNE
from sklearn import metrics
from sklearn import linear_model
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, plot_confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss, make_scorer
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split # used for splitting training and testing data
from sklearn.model_selection import train_test_split #used for splitting data into training data and testing data
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder # used for encoding categorical data
from sklearn.preprocessing import LabelEncoder # used for encoding categorical data
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import power_transform
from sklearn.preprocessing import StandardScaler # used for feature scaling
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn import svm
from sklearn.svm import SVC
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.eval_measures import rmse
from IPython.core.interactiveshell import InteractiveShell # display complete output cell
import time
import warnings
warnings.filterwarnings('ignore')
inputData = pd.read_csv("04.00.00 Predict_student_ knowledge_level.csv")
inputData.info()
Columns 6, 7 and 8 seem to be empty with 0 non-null values, hence we will drop these three columns. After opening the CSV in Excel, we confirmed that these columns are empty and contain no values.
inputData.head(10)
sns.heatmap(inputData.isnull(), cbar=False)
msno.matrix(inputData)
#Find number of missing cells in descending order
inputData.isnull().sum().sort_values(ascending=False)
#Find % of missing values in descending order
inputData.isnull().sum().sort_values(ascending=False)/len(inputData)
inputData.drop(inputData.columns[[6, 7, 8]], axis=1, inplace=True)
inputData.info()
# Selecting duplicate rows except first occurrence based on all columns
duplicateRows = inputData[inputData.duplicated()]
print("Duplicate Rows")
duplicateRows
# Selecting duplicate columns except first occurrence based on all rows
duplicateColumns = inputData.loc[:,inputData.apply(lambda x: x.duplicated(),axis=1).all()].copy()
print("Duplicate Columns")
duplicateColumns
def getDuplicateColumns(df):
    """Return the names of columns whose contents duplicate an earlier column.

    Every column is compared (via ``Series.equals``) against each column to
    its right; when two columns match, the right-hand column's name is
    recorded. The left-most occurrence is therefore never reported.

    :param df: DataFrame to scan.
    :return: list of column names that are duplicates of an earlier column.
    """
    duplicate_names = set()
    total_cols = df.shape[1]
    for left in range(total_cols):
        left_col = df.iloc[:, left]
        # Only look to the right of `left`, so each pair is checked once.
        for right in range(left + 1, total_cols):
            if left_col.equals(df.iloc[:, right]):
                duplicate_names.add(df.columns.values[right])
    return list(duplicate_names)
duplicateColumnNames = getDuplicateColumns(inputData)
#Print Duplicate Columns
duplicateColumnNames
# Function to return the constant value columns of a given DataFrame
def remove_constant_value_features(df):
    """Return the labels of columns that hold exactly one distinct value.

    Despite the name, nothing is removed here — the caller is expected to
    drop the returned columns.
    """
    constant_cols = []
    for label in df.columns:
        if df[label].nunique() == 1:
            constant_cols.append(label)
    return constant_cols
drop_col = remove_constant_value_features(inputData)
drop_col
#inputData.columns[inputData1.nunique() <= 1]
#Check if all but the last columns are float columns or integer columns. If they have special characters they will not be float type or integer type
inputData.dtypes
# remove spaces in columns name
inputData.columns = inputData.columns.str.replace(' ','')
#Check if all but the last columns are float columns or integer columns. If they have special characters they will not be float type or integer type
inputData.dtypes
dataplot = sns.heatmap(inputData.corr(), cmap="YlGnBu", annot=True)
inputData.corr()
#Copy Dataframe into a new variable
CopiedData=inputData[["STG", "SCG", "STR", "LPR", "PEG"]].copy(deep=True)
def calc_vif(X):
    """Compute the variance inflation factor (VIF) for every column of X.

    :param X: DataFrame of numeric predictors.
    :return: DataFrame with a 'variables' column (the feature names) and a
             'VIF' column (one score per feature, via statsmodels).
    """
    feature_count = X.shape[1]
    scores = [variance_inflation_factor(X.values, idx)
              for idx in range(feature_count)]
    vif_table = pd.DataFrame()
    vif_table["variables"] = X.columns
    vif_table["VIF"] = scores
    return vif_table
#ensure you don't select Rating column
X = CopiedData.loc[:, ~CopiedData.columns.isin(['Rating'])]
calc_vif(X)
We see that PEG has the highest VIF of 3.994313, but since all VIF values are less than 5 we don't need to remove any features
To understand correlation between independent variables better, we will create a pair plot
sns.pairplot(CopiedData)
plt.title('No of Datapoints per UNS',fontsize=20)
sns.countplot(inputData.UNS)
plt.xticks(rotation=75)
plt.show()
inputData["UNS"] = inputData["UNS"].replace('very_low','Very Low')
plt.title('No of Datapoints per UNS',fontsize=20)
sns.countplot(inputData.UNS)
plt.xticks(rotation=75)
plt.show()
# Plot outliers in box plot for X variables
# boxplot = inputData.boxplot(column=inputData.columns['Distance', 'Cost','Discount','Delivery charges','Surge charges','Packaging charges','ST','Tip'])
dfBoxPlot=inputData.select_dtypes(include='number')
dfBoxPlot.info()
boxplot = dfBoxPlot.boxplot(column=dfBoxPlot.columns.values.tolist())#[ :-1]
for col in dfBoxPlot:
plt.figure()
dfBoxPlot.boxplot(column=[col])
#remove outliers using IQR Range basis guidance from NIST
#https://www.itl.nist.gov/div898/handbook/prc/section1/prc16.htm
# Work on a deep copy so the cleaned frame does not alias inputData.
inputDataPlay=inputData.copy(deep=True)
inputDataPlay.info(verbose=True,memory_usage='deep',show_counts=True)
# Only numeric columns take part in the IQR fences.
col_name = inputDataPlay.select_dtypes(include='number').columns.values.tolist()
# Per-column quartiles (each is a Series indexed by column name).
Q1 = inputDataPlay[col_name].quantile(0.25)
Q3 = inputDataPlay[col_name].quantile(0.75)
IQR = Q3 - Q1
# Tukey fences: values outside Q1-1.5*IQR .. Q3+1.5*IQR count as outliers.
lower_boundary = Q1 - (1.5 * IQR)
upper_boundary = Q3 + (1.5 * IQR)
# The fences were computed once on the full data above, so the order in
# which rows are filtered out below does not shift the thresholds.
for column in col_name:
inputDataPlay = inputDataPlay[inputDataPlay[column] >= lower_boundary[column]]
inputDataPlay = inputDataPlay[inputDataPlay[column] <= upper_boundary[column]]
#print(df)
inputDataPlay.info(verbose=True,memory_usage='deep',show_counts=True)
inputDataPlay.describe(include='all')
# Plot outliers in box plot for X variables
# boxplot = inputData.boxplot(column=inputData.columns['Distance', 'Cost','Discount','Delivery charges','Surge charges','Packaging charges','ST','Tip'])
dfBoxPlot=inputDataPlay.select_dtypes(include='number')
dfBoxPlot.info()
boxplot = dfBoxPlot.boxplot(column=dfBoxPlot.columns.values.tolist())#[ :-1]
inputDataPlay.boxplot(layout=(2,3), by='UNS', figsize=[15,10])
Convert category values into integer values
columns = ['UNS']
for col in columns:
inputDataPlay[col] = inputDataPlay[col].astype('category')
inputDataPlay.info()
inputDataPlay["UNS"] = inputDataPlay.UNS.map({"Very Low":0,"Low":1,"Middle":2,"High":3})
inputDataPlay.info()
inputDataPlay.head(10)
X = inputDataPlay.loc[:, ~inputDataPlay.columns.isin(['UNS'])]
X
y = inputDataPlay['UNS']
y
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_train.head(5)
#create train and test dataframes for ChefBoost implementation using Gain Ratio
dfTrain = pd.concat([X_train, y_train], axis=1)
dfTrain['UNS'] = dfTrain.UNS.astype(str)
dfTrain.head(5)
dfTrain.info()
dfTest = pd.concat([X_test, y_test], axis=1)
dfTest['UNS'] = dfTest.UNS.astype(str)
dfTest.head(5)
dfTest.info()
y_train.value_counts()
smot = RandomOverSampler(random_state=42)
X_train_smote,y_train_smote = smot.fit_resample(X_train,y_train)
y_train_smote.value_counts()
#create train and test dataframes for ChefBoost implementation using Gain Ratio
dfTrain_smote = pd.concat([X_train_smote, y_train_smote], axis=1)
dfTrain_smote['UNS'] = dfTrain_smote.UNS.astype(str)
dfTrain_smote.head(5)
dfTrain_smote.info()
labels=['STG', 'SCG','STR','LPR','PEG']
classes=['Very Low','Low','Middle','High']
### Create Array to store results
# create a Feature_Subset_Result array
dFFeature_Subset_Result = pd.DataFrame(columns=['Best Estimator', 'Model',
'Training Time', 'Testing Time',
'Training Set Accuracy', 'Testing Set Accuracy',
'Training Set Confusion Matrix', 'Testing Set Confusion Matrix',
'Classifiction Report', 'Parameters of best estimator',
'Avg. Cross Validation Score of Best Estimator',
'Total number of cross validation sets','FPR','TPR','P_FPR','P_TPR','ccp_alphas',
'X_train','y_train','X_test','y_test'])
Feature_Subset_Result = []
plt.rcParams["font.family"] = 'DejaVu Sans'
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion Matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated matplotlib image.

    :param cm: square confusion-matrix array (counts).
    :param classes: tick labels, one per class, in matrix order.
    :param normalize: when True, rescale each row to sum to 1 before drawing.
    :param title: figure title.
    :param cmap: matplotlib colormap for the cells.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    ticks = np.arange(len(classes))
    plt.xticks(ticks, classes, rotation=90)
    plt.yticks(ticks, classes)
    cell_fmt = '.2f' if normalize else 'd'
    # Cells darker than half the max get white text for contrast.
    cutoff = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            plt.text(col, row, format(cm[row, col], cell_fmt),
                     horizontalalignment="center",
                     color="white" if cm[row, col] > cutoff else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
# helper function
def plot_confusionmatrix(y_train_pred, y_train, dom):
    """Print a heading and draw a seaborn heatmap of the confusion matrix.

    Uses the module-level `classes` list for both axis labels.

    :param y_train_pred: predicted labels.
    :param y_train: reference labels.
    :param dom: text prefix for the printed heading (e.g. 'Train'/'Test').
    """
    print(f'{dom} Confusion matrix')
    matrix = confusion_matrix(y_train_pred, y_train)
    sns.heatmap(matrix, annot=True, yticklabels=classes,
                xticklabels=classes, cmap='Blues', fmt='g')
    plt.tight_layout()
    plt.show()
from datetime import datetime
def perform_model(model, X_train, y_train, X_test, y_test, class_labels, cm_normalize=True, \
print_cm=True, cm_cmap=plt.cm.Greens):
'''
Fit `model` on the training split and print/plot a full evaluation.

Reports training and testing wall-clock times, train/test accuracy,
train/test confusion matrices (raw text plus normalized plots), a
classification report, a ROC curve, renderings of the fitted decision
tree (sklearn plot_tree and graphviz), a textual dump of the tree
structure, and the cost-complexity pruning alphas.

:param model: a GridSearchCV wrapping a DecisionTreeClassifier -- the code
below reads `model.best_estimator_` and `cost_complexity_pruning_path`,
so a plain non-grid estimator will not work here.
:param class_labels: tick labels for the confusion-matrix plots.
:param cm_normalize: unused -- the plotting calls below hardcode normalize=True.
:param print_cm: when True, also print the raw confusion matrices.
:param cm_cmap: colormap for the confusion-matrix plots.
:return: dict collecting every computed artifact (times, accuracies,
confusion matrices, report, ROC arrays, ccp_alphas, the fitted model,
and the four data splits passed in).
'''
# to store results at various phases
results = dict()
results['X_train'] = X_train
results['y_train'] = y_train
results['X_test'] = X_test
results['y_test'] = y_test
# time at which model starts training
train_start_time = datetime.now()
print('training the model..')
model.fit(X_train, y_train)
print('Done \n \n')
train_end_time = datetime.now()
results['training_time'] = train_end_time - train_start_time
print('---------------------')
print('| Training Time |')
print('---------------------')
print('training_time(HH:MM:SS.ms) - {}\n\n'.format(results['training_time']))
results['model_name_'] = str(model)
#print(str(model))
# predict test data
print('Predicting test data')
test_start_time = datetime.now()
y_pred = model.predict(X_test)
y_pred_train = model.predict(X_train)
test_end_time = datetime.now()
print('Done \n \n')
results['testing_time'] = test_end_time - test_start_time
print('---------------------')
print('| Testing Time |')
print('---------------------')
print('testing time(HH:MM:SS:ms) - {}\n\n'.format(results['testing_time']))
results['predicted'] = y_pred
# calculate overall training accuracy of the model
train_accuracy = metrics.accuracy_score(y_true=y_train, y_pred=y_pred_train)
# store accuracy in results
results['TrainingAccuracy'] = train_accuracy
print('---------------------')
print('| Training Set Accuracy |')
print('---------------------')
print('\n {}\n\n'.format(train_accuracy))
# calculate overall testing accuracy of the model
test_accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
# store accuracy in results
results['TestingAccuracy'] = test_accuracy
print('---------------------')
print('| Testing Set Accuracy |')
print('---------------------')
print('\n {}\n\n'.format(test_accuracy))
# Training set confusion matrix
cm_train = metrics.confusion_matrix(y_train, y_pred_train)
results['confusion_matrix_train'] = cm_train
if print_cm:
print('-----------------------------')
print('| Train Set Confusion Matrix |')
print('-----------------------------')
print('\n {}'.format(cm_train))
# Test set confusion matrix
cm_test = metrics.confusion_matrix(y_test, y_pred)
results['confusion_matrix_test'] = cm_test
if print_cm:
print('\n-----------------------------')
print('| Test Set Confusion Matrix |')
print('-----------------------------')
print('\n {}'.format(cm_test))
# plot train-set confusion matrix
plt.figure(figsize=(8,8))
# NOTE(review): the `b` keyword of plt.grid was renamed to `visible` and
# removed in matplotlib 3.6 -- confirm the installed version accepts it.
plt.grid(b=False)
#print(class_labels)
plot_confusion_matrix(cm_train, class_labels, normalize=True, title='Train Set Normalized confusion matrix', cmap = cm_cmap)
plt.show()
# plot test-set confusion matrix
plt.figure(figsize=(8,8))
plt.grid(b=False)
#print(class_labels)
plot_confusion_matrix(cm_test, class_labels, normalize=True, title='Test Set Normalized confusion matrix', cmap = cm_cmap)
plt.show()
# get classification report
print('-------------------------')
print('| Classification Report |')
print('-------------------------')
classification_report = metrics.classification_report(y_test, y_pred)
# store report in results
results['classification_report'] = classification_report
print(classification_report)
# get ROC Score and Curve
print('-------------------------')
print('| ROC Curve |')
print('-------------------------')
# predict probabilities
pred_prob = model.predict_proba(X_test)
# roc curve for models
# NOTE(review): the target has four classes elsewhere in this notebook, but
# this curve uses only the probability column for class 1 (pos_label=1), so
# it is a one-vs-rest ROC for class 1 only -- confirm that is the intent.
fpr, tpr, thresh = roc_curve(y_test, pred_prob[:,1], pos_label=1)
results['fpr'] = fpr
results['tpr'] = tpr
# roc curve for tpr = fpr (blue line with 50% area)
random_probs = [0 for i in range(len(y_test))]
p_fpr, p_tpr, _ = roc_curve(y_test, random_probs, pos_label=1)
results['p_fpr'] = p_fpr
results['p_tpr'] = p_tpr
# auc scores
# auc_score = roc_auc_score(y_test, pred_prob[:,1])
# plot roc curves
plt.plot(fpr, tpr, linestyle='--',color='orange', label="")
plt.plot(p_fpr, p_tpr, linestyle='--', color='blue')
# title
plt.title('ROC curve')
# x label
plt.xlabel('False Positive Rate')
# y label
plt.ylabel('True Positive rate')
plt.legend(loc='best')
plt.savefig('ROC',dpi=300)
plt.rcParams['figure.figsize'] = [10, 6]
plt.show();
# add the trained model to the results
results['model'] = model
# Plot Decision Tree via. Plot_tree
print('------------------------------------')
print('| Decision Tree via. Plot_tree |')
print('------------------------------------')
final_model = model.best_estimator_
# NOTE(review): this refits the grid-search winner on the TEST split, so the
# tree plotted and described below reflects a model trained on test data,
# not the estimator evaluated above -- confirm this is intentional.
final_model_fit = final_model.fit(X_test, y_test)
#final_model.fit(X_train, y_train)
#y_predict = final_model.predict(X_test)
_, ax = plt.subplots(figsize=(30,30)) # Resize figure
#fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (4,4), dpi=300)
tree.plot_tree(final_model_fit, filled=True, feature_names = labels, ax=ax)
plt.title("Decision trees")
#plt.figure(figsize=(40,40))
plt.show()
# Plot Decision Tree via. graphviz
print('------------------------------------')
print('| Decision Tree via. graphviz |')
print('------------------------------------')
#dot_data = export_graphviz(model.best_estimator_, out_file=None, filled=True, rounded=True, feature_names=labels) #, class_names=['0','1','2']
dot_data = tree.export_graphviz(final_model_fit, feature_names = labels, filled = True, rounded=True, special_characters=True, out_file=None)
graph = graphviz.Source(dot_data)
display(graph)
print('--------------------------------------')
print('| Decision Tree Description |')
print('--------------------------------------')
# Walk the fitted tree's low-level arrays to print its structure.
n_nodes = final_model_fit.tree_.node_count
children_left = final_model_fit.tree_.children_left
children_right = final_model_fit.tree_.children_right
feature = final_model_fit.tree_.feature
threshold = final_model_fit.tree_.threshold
node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
is_leaves = np.zeros(shape=n_nodes, dtype=bool)
stack = [(0, 0)] # start with the root node id (0) and its depth (0)
while len(stack) > 0:
# `pop` ensures each node is only visited once
node_id, depth = stack.pop()
node_depth[node_id] = depth
# If the left and right child of a node is not the same we have a split
# node
is_split_node = children_left[node_id] != children_right[node_id]
# If a split node, append left and right children and depth to `stack`
# so we can loop through them
if is_split_node:
stack.append((children_left[node_id], depth + 1))
stack.append((children_right[node_id], depth + 1))
else:
is_leaves[node_id] = True
print(
"The binary tree structure has {n} nodes and has "
"the following tree structure:\n".format(n=n_nodes)
)
for i in range(n_nodes):
if is_leaves[i]:
print(
"{space}node={node} is a leaf node.".format(
space=node_depth[i] * "\t", node=i
)
)
else:
print(
"{space}node={node} is a split node: "
"go to node {left} if X[:, {feature}] <= {threshold} "
"else to node {right}.".format(
space=node_depth[i] * "\t",
node=i,
left=children_left[i],
feature=feature[i],
threshold=threshold[i],
right=children_right[i],
)
)
# Effective alphas for cost-complexity (post) pruning, computed on train.
path = final_model.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
print('\n--------------------------------------')
print('| ccp_alphas for Post Pruning |')
print('--------------------------------------\n')
print(ccp_alphas)
results['ccp_alphas'] = ccp_alphas
return results
def print_grid_search_attributes(log_reg_grid_results):
'''
Print the winning estimator, best parameters, number of CV splits and the
best cross-validated score from a fitted GridSearchCV, then append one
summary row to the module-level `Feature_Subset_Result` list.

:param log_reg_grid_results: the dict returned by `perform_model`; its
'model' entry must be a fitted GridSearchCV (best_estimator_,
best_params_, best_score_ and n_splits_ are read below).
'''
model = log_reg_grid_results['model']
# Estimator that gave highest score among all the estimators formed in GridSearch
print('--------------------------')
print('| Best Estimator |')
print('--------------------------')
print('\n\t{}\n'.format(model.best_estimator_))
# parameters that gave best results while performing grid search
print('--------------------------')
print('| Best parameters |')
print('--------------------------')
print('\tParameters of best estimator : \n\n\t{}\n'.format(model.best_params_))
# number of cross validation splits
print('---------------------------------')
print('| No of CrossValidation sets |')
print('--------------------------------')
print('\n\tTotal number of cross validation sets: {}\n'.format(model.n_splits_))
# Average cross validated score of the best estimator, from the Grid Search
print('--------------------------')
print('| Best Score |')
print('--------------------------')
print('\n\tAverage Cross Validate scores of best estimator : \n\n\t{}\n'.format(model.best_score_))
# Side effect: record one row in the global results table; the tuple order
# matches the columns declared for dFFeature_Subset_Result.
Feature_Subset_Result.append((model.best_estimator_, str(model), log_reg_grid_results['training_time'], log_reg_grid_results['testing_time'],
log_reg_grid_results['TrainingAccuracy'], log_reg_grid_results['TestingAccuracy'],
log_reg_grid_results['confusion_matrix_train'], log_reg_grid_results['confusion_matrix_test'],
log_reg_grid_results['classification_report'],
model.best_params_, model.best_score_, model.n_splits_, log_reg_grid_results['fpr'], log_reg_grid_results['tpr'],
log_reg_grid_results['p_fpr'], log_reg_grid_results['p_tpr'],
log_reg_grid_results['ccp_alphas'],
log_reg_grid_results['X_train'],log_reg_grid_results['y_train'],log_reg_grid_results['X_test'],log_reg_grid_results['y_test']))
# start Grid search with Gini Criterion
# NOTE(review): max_features='auto' was deprecated and then removed in
# scikit-learn 1.3 -- confirm the installed version still accepts it,
# otherwise drop it from the grid.
parameters = {'criterion': ['gini'],
'splitter': ['best', 'random'],
'max_depth':[None],
'min_samples_split':[2],
'min_samples_leaf':[1],
'min_weight_fraction_leaf':[0.0],
'max_features':['auto', 'sqrt', 'log2'],
'random_state':[0],
'max_leaf_nodes':[None],
'min_impurity_decrease':[0.0],
'class_weight':[None,'balanced'],
'ccp_alpha':[0.0]}
# Exhaustive 3-fold grid search over the parameter grid, using all cores.
des_tree_gini = DecisionTreeClassifier()
des_tree_gini_grid = GridSearchCV(des_tree_gini, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
# Train and evaluate on the original (imbalanced) training split.
des_tree_gini_grid_results = perform_model(des_tree_gini_grid, X_train, y_train, X_test, y_test, labels)
# observe the attributes of the model
print_grid_search_attributes(des_tree_gini_grid_results)
# start Grid search with Gini Criterion on the oversampled training data
# (same grid as the previous cell; only the training split differs)
# NOTE(review): max_features='auto' was removed in scikit-learn 1.3 --
# confirm the installed version still accepts it.
parameters = {'criterion': ['gini'],
'splitter': ['best', 'random'],
'max_depth':[None],
'min_samples_split':[2],
'min_samples_leaf':[1],
'min_weight_fraction_leaf':[0.0],
'max_features':['auto', 'sqrt', 'log2'],
'random_state':[0],
'max_leaf_nodes':[None],
'min_impurity_decrease':[0.0],
'class_weight':[None,'balanced'],
'ccp_alpha':[0.0]}
des_tree_gini = DecisionTreeClassifier()
des_tree_gini_grid = GridSearchCV(des_tree_gini, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
# Train on the RandomOverSampler-balanced split; evaluate on the untouched test split.
des_tree_gini_grid_results = perform_model(des_tree_gini_grid, X_train_smote, y_train_smote, X_test, y_test, labels)
# observe the attributes of the model
print_grid_search_attributes(des_tree_gini_grid_results)
Several classic decision tree algorithms (ID3, C4.5, CART, CHAID) are wrapped by the ChefBoost library.
We will use ChefBoost's C4.5 implementation, which splits on Gain Ratio, to build our model.
# to store results at various phases
ChefBoostresults = dict()
ChefBoostresults['X_train'] = X_train
ChefBoostresults['y_train'] = y_train
ChefBoostresults['X_test'] = X_test
ChefBoostresults['y_test'] = y_test
#Set algorithm to ID3, C4.5, CART, CHAID or Regression
# NOTE(review): 'enableParallelism' and 'num_cores' are wrapped in lists here;
# chefboost's documented config uses plain scalars (False, 1) -- confirm the
# installed version tolerates list values.
config = {'algorithm': 'C4.5', 'enableParallelism': [False], 'num_cores': [1]}
# time at which model starts training
train_start_time = datetime.now()
print('training the model..')
# chef.fit mutates its input frame, hence the deep copy of dfTrain.
model = chef.fit(dfTrain.copy(deep=True), config, target_label = 'UNS')
print('Done \n \n')
train_end_time = datetime.now()
ChefBoostresults['training_time'] = train_end_time - train_start_time
print('---------------------')
print('| Training Time |')
print('---------------------')
print('training_time(HH:MM:SS.ms) - {}\n\n'.format(ChefBoostresults['training_time']))
Check the prediction for the first row of the training data
dfTrain.iloc[0]
prediction = chef.predict(model, dfTrain.iloc[0])
prediction
Check predictions for the entire training data set
for index, instance in dfTrain.iterrows():
prediction = chef.predict(model, instance)
actual = instance['UNS']
if actual == prediction:
classified = True
else:
classified = False
print("*",end='')
#Mark predictions which are not right, with a * mark at the start
print ("Actual: ", actual, "--Prediction: ", prediction, "--Error: ", int(prediction) - int(actual))
Check Feature Importance
rules = "outputs/rules/rules.py"
fi = chef.feature_importance(rules).set_index("feature")
fi.plot(kind="barh", title="Feature Importance");
print(fi)
This shows that the STG Feature is the most important feature, post which we have the LPR and SCG feature, after which is the PEG Feature.
Evaluate the model using dfTest
# predict test data
print('Predicting test data')
test_start_time = datetime.now()
chef.evaluate(model, dfTest, task="test", target_label = 'UNS')
test_end_time = datetime.now()
print('Done \n \n')
ChefBoostresults['testing_time'] = test_end_time - test_start_time
print('---------------------')
print('| Testing Time |')
print('---------------------')
print('testing time(HH:MM:SS:ms) - {}\n\n'.format(ChefBoostresults['testing_time']))
Feature_Subset_Result.append(("chef.fit", "chef.fit(dfTrain.copy(deep=True), config = {'algorithm': 'C4.5', 'enableParallelism': [False], 'num_cores': [1]}, target_label = 'UNS')",
ChefBoostresults['training_time'], ChefBoostresults['testing_time'],
"0.7086", "0.650",
"[[90, 15, 32, 0], [0, 69, 0, 33], [0, 0, 3, 0], [0, 1, 0, 35]]", "[[9, 0, 1, 0], [0, 38, 7, 13], [20, 1, 29, 0], [0, 0, 0, 2]]",
"-",
"", "", "", "", "",
"", "",
"",
ChefBoostresults['X_train'],ChefBoostresults['y_train'],ChefBoostresults['X_test'],ChefBoostresults['y_test']))
So we will use the C4.5 method of ChefBoost to use Gain Ratio and create our model
# to store results at various phases
ChefBoostresults = dict()
ChefBoostresults['X_train'] = X_train_smote
ChefBoostresults['y_train'] = y_train_smote
ChefBoostresults['X_test'] = X_test
ChefBoostresults['y_test'] = y_test
#Set algorithm to ID3, C4.5, CART, CHAID or Regression
config = {'algorithm': 'C4.5', 'enableParallelism': [False], 'num_cores': [1]}
# time at which model starts training
train_start_time = datetime.now()
print('training the model..')
model = chef.fit(dfTrain_smote.copy(deep=True), config, target_label = 'UNS')
print('Done \n \n')
train_end_time = datetime.now()
ChefBoostresults['training_time'] = train_end_time - train_start_time
print('---------------------')
print('| Training Time |')
print('---------------------')
print('training_time(HH:MM:SS.ms) - {}\n\n'.format(ChefBoostresults['training_time']))
Check the prediction for the first row of the oversampled training data
dfTrain_smote.iloc[0]
prediction = chef.predict(model, dfTrain_smote.iloc[0])
prediction
Check predictions for the entire oversampled training data set
# Predict every training row and print actual vs. predicted labels.
# Misclassified rows are prefixed with a '*' marker.
# FIX: removed the dead `classified` flag - it was assigned in both branches
# but never read; printed output is unchanged.
# NOTE(review): int(prediction) - int(actual) assumes the class labels are
# numeric strings / integers - verify against how 'UNS' was encoded.
for index, instance in dfTrain_smote.iterrows():
    prediction = chef.predict(model, instance)
    actual = instance['UNS']
    if actual != prediction:
        #Mark predictions which are not right, with a * mark at the start
        print("*",end='')
    print ("Actual: ", actual, "--Prediction: ", prediction, "--Error: ", int(prediction) - int(actual))
Check Feature Importance
# Load the rule file ChefBoost generated, then chart and print the
# per-feature importance scores.
rules_path = "outputs/rules/rules.py"
importance = chef.feature_importance(rules_path).set_index("feature")
importance.plot(kind="barh", title="Feature Importance");
print(importance)
This shows that STG is the most important feature, followed by LPR and SCG, and then PEG.
Evaluate the model using dfTest
# Evaluate the ChefBoost model on the held-out test frame and time it.
print('Predicting test data')
test_start_time = datetime.now()
chef.evaluate(model, dfTest, task="test", target_label = 'UNS')
test_end_time = datetime.now()
print('Done \n \n')
ChefBoostresults['testing_time'] = test_end_time - test_start_time
print('---------------------')
print('| Testing Time |')
print('---------------------')
# FIX: label normalised to 'HH:MM:SS.ms' (dot before ms) so it matches the
# training-time message printed earlier.
print('testing time(HH:MM:SS.ms) - {}\n\n'.format(ChefBoostresults['testing_time']))
# Record the SMOTE-balanced ChefBoost (C4.5) run in the shared results list.
# Tuple layout mirrors the earlier ChefBoost entry: fit-call name and string,
# timings, train/test accuracies, confusion matrices, placeholder columns for
# fields that do not apply to ChefBoost, and the data splits used.
# NOTE(review): accuracies and confusion matrices are hand-transcribed from
# ChefBoost's printed evaluation output - verify against the actual run.
Feature_Subset_Result.append(("chef.fit", "chef.fit(dfTrain_smote.copy(deep=True), config = {'algorithm': 'C4.5', 'enableParallelism': [False], 'num_cores': [1]}, target_label = 'UNS')",
ChefBoostresults['training_time'], ChefBoostresults['testing_time'],
"0.6055", "0.6583",
"[[90, 17, 85, 0], [0, 72, 0, 39], [0, 0, 5, 0], [0, 1, 0, 51]]", "[[8, 0, 1, 0], [0, 38, 5, 13], [21, 1, 31, 0], [0, 0, 0, 2]]",
"-",
"", "", "", "", "",
"", "",
"",
ChefBoostresults['X_train'],ChefBoostresults['y_train'],ChefBoostresults['X_test'],ChefBoostresults['y_test']))
# Grid search with the Entropy criterion on the original (imbalanced) training data.
# FIX: dropped 'auto' from max_features - it was deprecated in scikit-learn 1.1
# and removed in 1.3; for classifiers it was an alias of 'sqrt', so the
# effective search space is unchanged while the code keeps working on
# current scikit-learn versions.
parameters = {'criterion': ['entropy'],
              'splitter': ['best', 'random'],
              'max_depth': [None],
              'min_samples_split': [2],
              'min_samples_leaf': [1],
              'min_weight_fraction_leaf': [0.0],
              'max_features': ['sqrt', 'log2'],
              'random_state': [0],
              'max_leaf_nodes': [None],
              'min_impurity_decrease': [0.0],
              'class_weight': [None, 'balanced'],
              'ccp_alpha': [0.0]}
des_tree_entropy = DecisionTreeClassifier()
des_tree_entropy_grid = GridSearchCV(des_tree_entropy, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_entropy_grid_results = perform_model(des_tree_entropy_grid, X_train, y_train, X_test, y_test, labels)
# observe the attributes of the model
print_grid_search_attributes(des_tree_entropy_grid_results)
# Repeat the same Entropy grid search on the SMOTE-balanced training data.
parameters = {'criterion': ['entropy'],
              'splitter': ['best', 'random'],
              'max_depth': [None],
              'min_samples_split': [2],
              'min_samples_leaf': [1],
              'min_weight_fraction_leaf': [0.0],
              'max_features': ['sqrt', 'log2'],
              'random_state': [0],
              'max_leaf_nodes': [None],
              'min_impurity_decrease': [0.0],
              'class_weight': [None, 'balanced'],
              'ccp_alpha': [0.0]}
des_tree_entropy = DecisionTreeClassifier()
des_tree_entropy_grid = GridSearchCV(des_tree_entropy, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_entropy_grid_results = perform_model(des_tree_entropy_grid, X_train_smote, y_train_smote, X_test, y_test, labels)
# observe the attributes of the model
print_grid_search_attributes(des_tree_entropy_grid_results)
# Grid search with the Log_Loss criterion on the original training data.
# Log_Loss requires scikit-learn >= 1.1, where max_features='auto' is already
# deprecated (removed in 1.3).
# FIX: dropped 'auto' from max_features; it was an alias of 'sqrt' for
# classifiers, so the effective search space is unchanged.
parameters = {'criterion': ['log_loss'],
              'splitter': ['best', 'random'],
              'max_depth': [None],
              'min_samples_split': [2],
              'min_samples_leaf': [1],
              'min_weight_fraction_leaf': [0.0],
              'max_features': ['sqrt', 'log2'],
              'random_state': [0],
              'max_leaf_nodes': [None],
              'min_impurity_decrease': [0.0],
              'class_weight': [None, 'balanced'],
              'ccp_alpha': [0.0]}
des_tree_logloss = DecisionTreeClassifier()
des_tree_logloss_grid = GridSearchCV(des_tree_logloss, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_logloss_grid_results = perform_model(des_tree_logloss_grid, X_train, y_train, X_test, y_test, labels)
# observe the attributes of the model
print_grid_search_attributes(des_tree_logloss_grid_results)
# Repeat the same Log_Loss grid search on the SMOTE-balanced training data.
parameters = {'criterion': ['log_loss'],
              'splitter': ['best', 'random'],
              'max_depth': [None],
              'min_samples_split': [2],
              'min_samples_leaf': [1],
              'min_weight_fraction_leaf': [0.0],
              'max_features': ['sqrt', 'log2'],
              'random_state': [0],
              'max_leaf_nodes': [None],
              'min_impurity_decrease': [0.0],
              'class_weight': [None, 'balanced'],
              'ccp_alpha': [0.0]}
des_tree_logloss = DecisionTreeClassifier()
des_tree_logloss_grid = GridSearchCV(des_tree_logloss, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_logloss_grid_results = perform_model(des_tree_logloss_grid, X_train_smote, y_train_smote, X_test, y_test, labels)
# observe the attributes of the model
print_grid_search_attributes(des_tree_logloss_grid_results)
# Append every collected result tuple as a new row of the summary DataFrame,
# then print the key comparison columns as a markdown table.
# NOTE(review): unlike the later summary cells, this one does not clear
# dFFeature_Subset_Result first - presumably it is still empty at this point;
# verify, otherwise rows would be duplicated on re-runs.
for item in Feature_Subset_Result:
dFFeature_Subset_Result.loc[dFFeature_Subset_Result.shape[0]] = item
print(dFFeature_Subset_Result[['Best Estimator','Training Set Accuracy', 'Testing Set Accuracy', 'Training Time', 'Testing Time',
'Avg. Cross Validation Score of Best Estimator']].to_markdown(tablefmt="pretty"))
Pre-pruning stops the growth of the decision tree at an early stage. We can limit the growth of the tree by setting constraints on parameters such as max_depth, min_samples, etc.
An effective way to do this is to grid-search those parameters and choose the optimum values that give better performance on the test data.
For now we will control these parameters.
# Pre-pruning grid search (Gini) on the original training data: limit tree
# growth by searching over max_depth / min_samples_split / min_samples_leaf.
# FIX: dropped 'auto' from max_features (deprecated in scikit-learn 1.1,
# removed in 1.3); it was an alias of 'sqrt' for classifiers, so the
# effective search space is unchanged.
parameters = {'criterion': ['gini'],
              'splitter': ['best', 'random'],
              'max_depth': [2,4,6,8,10,12],
              'min_samples_split': [2,3,4],
              'min_samples_leaf': [1,2],
              'min_weight_fraction_leaf': [0.0],
              'max_features': ['sqrt', 'log2'],
              'random_state': [0],
              'max_leaf_nodes': [None],
              'min_impurity_decrease': [0.0],
              'class_weight': [None, 'balanced'],
              'ccp_alpha': [0.0]} #for pruning use [0.0, 0.1, 0.01, 0.001]
des_tree_gini_prepruning = DecisionTreeClassifier()
des_tree_gini_grid_prepruning = GridSearchCV(des_tree_gini_prepruning, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_gini_grid_results_prepruning = perform_model(des_tree_gini_grid_prepruning, X_train, y_train, X_test, y_test, labels)
# observe the attributes of the model
print_grid_search_attributes(des_tree_gini_grid_results_prepruning)
# Repeat the same pre-pruning grid search on the SMOTE-balanced training data.
parameters = {'criterion': ['gini'],
              'splitter': ['best', 'random'],
              'max_depth': [2,4,6,8,10,12],
              'min_samples_split': [2,3,4],
              'min_samples_leaf': [1,2],
              'min_weight_fraction_leaf': [0.0],
              'max_features': ['sqrt', 'log2'],
              'random_state': [0],
              'max_leaf_nodes': [None],
              'min_impurity_decrease': [0.0],
              'class_weight': [None, 'balanced'],
              'ccp_alpha': [0.0]} #for pruning use [0.0, 0.1, 0.01, 0.001]
des_tree_gini_prepruning = DecisionTreeClassifier()
des_tree_gini_grid_prepruning = GridSearchCV(des_tree_gini_prepruning, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_gini_grid_results_prepruning = perform_model(des_tree_gini_grid_prepruning, X_train_smote, y_train_smote, X_test, y_test, labels)
# observe the attributes of the model
print_grid_search_attributes(des_tree_gini_grid_results_prepruning)
# Pre-pruning grid search (Entropy) on the original training data.
# FIX: dropped 'auto' from max_features (deprecated in scikit-learn 1.1,
# removed in 1.3); alias of 'sqrt' for classifiers, so search space unchanged.
parameters = {'criterion': ['entropy'],
              'splitter': ['best', 'random'],
              'max_depth': [2,4,6,8,10,12],
              'min_samples_split': [2,3,4],
              'min_samples_leaf': [1,2],
              'min_weight_fraction_leaf': [0.0],
              'max_features': ['sqrt', 'log2'],
              'random_state': [0],
              'max_leaf_nodes': [None],
              'min_impurity_decrease': [0.0],
              'class_weight': [None, 'balanced'],
              'ccp_alpha': [0.0]}
des_tree_entropy_prepruning = DecisionTreeClassifier()
des_tree_entropy_grid_prepruning = GridSearchCV(des_tree_entropy_prepruning, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_entropy_grid_results_prepruning = perform_model(des_tree_entropy_grid_prepruning, X_train, y_train, X_test, y_test, labels)
# observe the attributes of the model
print_grid_search_attributes(des_tree_entropy_grid_results_prepruning)
# Repeat the same pre-pruning grid search on the SMOTE-balanced training data.
parameters = {'criterion': ['entropy'],
              'splitter': ['best', 'random'],
              'max_depth': [2,4,6,8,10,12],
              'min_samples_split': [2,3,4],
              'min_samples_leaf': [1,2],
              'min_weight_fraction_leaf': [0.0],
              'max_features': ['sqrt', 'log2'],
              'random_state': [0],
              'max_leaf_nodes': [None],
              'min_impurity_decrease': [0.0],
              'class_weight': [None, 'balanced'],
              'ccp_alpha': [0.0]}
des_tree_entropy_prepruning = DecisionTreeClassifier()
des_tree_entropy_grid_prepruning = GridSearchCV(des_tree_entropy_prepruning, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_entropy_grid_results_prepruning = perform_model(des_tree_entropy_grid_prepruning, X_train_smote, y_train_smote, X_test, y_test, labels)
# observe the attributes of the model
print_grid_search_attributes(des_tree_entropy_grid_results_prepruning)
# Pre-pruning grid search (Log_Loss) on the original training data.
# FIX: dropped 'auto' from max_features (deprecated in scikit-learn 1.1,
# removed in 1.3); alias of 'sqrt' for classifiers, so search space unchanged.
parameters = {'criterion': ['log_loss'],
              'splitter': ['best', 'random'],
              'max_depth': [2,4,6,8,10,12],
              'min_samples_split': [2,3,4],
              'min_samples_leaf': [1,2],
              'min_weight_fraction_leaf': [0.0],
              'max_features': ['sqrt', 'log2'],
              'random_state': [0],
              'max_leaf_nodes': [None],
              'min_impurity_decrease': [0.0],
              'class_weight': [None, 'balanced'],
              'ccp_alpha': [0.0]}
des_tree_logloss_prepruning = DecisionTreeClassifier()
des_tree_logloss_grid_prepruning = GridSearchCV(des_tree_logloss_prepruning, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_logloss_grid_results_prepruning = perform_model(des_tree_logloss_grid_prepruning, X_train, y_train, X_test, y_test, labels)
# observe the attributes of the model
print_grid_search_attributes(des_tree_logloss_grid_results_prepruning)
# Repeat the same pre-pruning grid search on the SMOTE-balanced training data.
parameters = {'criterion': ['log_loss'],
              'splitter': ['best', 'random'],
              'max_depth': [2,4,6,8,10,12],
              'min_samples_split': [2,3,4],
              'min_samples_leaf': [1,2],
              'min_weight_fraction_leaf': [0.0],
              'max_features': ['sqrt', 'log2'],
              'random_state': [0],
              'max_leaf_nodes': [None],
              'min_impurity_decrease': [0.0],
              'class_weight': [None, 'balanced'],
              'ccp_alpha': [0.0]}
des_tree_logloss_prepruning = DecisionTreeClassifier()
des_tree_logloss_grid_prepruning = GridSearchCV(des_tree_logloss_prepruning, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_logloss_grid_results_prepruning = perform_model(des_tree_logloss_grid_prepruning, X_train_smote, y_train_smote, X_test, y_test, labels)
# observe the attributes of the model
print_grid_search_attributes(des_tree_logloss_grid_results_prepruning)
# Rebuild the summary table from scratch (drop all existing rows, then
# re-append every collected result) and print the key comparison columns.
dFFeature_Subset_Result.drop(dFFeature_Subset_Result.index, inplace=True)
for row in Feature_Subset_Result:
    dFFeature_Subset_Result.loc[len(dFFeature_Subset_Result)] = row
summary_cols = ['Best Estimator','Training Set Accuracy', 'Testing Set Accuracy', 'Training Time', 'Testing Time',
                'Avg. Cross Validation Score of Best Estimator']
print(dFFeature_Subset_Result[summary_cols].to_markdown(tablefmt="pretty"))
Decision trees can easily overfit. One way to avoid this is to limit the growth of the tree by setting constraints during pre-pruning. We can limit parameters like max_depth, min_samples, etc.
But a more effective way is to use post-pruning methods like cost-complexity pruning. This helps to improve test accuracy and get a better model.
Cost-complexity pruning is all about finding the right value for the parameter alpha. We will get the alpha values for this tree and will check the accuracy of the pruned trees.
#X_train = dFFeature_Subset_Result.iloc[0]['X_train']
#y_train = dFFeature_Subset_Result.iloc[0]['y_train']
#X_test = dFFeature_Subset_Result.iloc[0]['X_test']
#y_test = dFFeature_Subset_Result.iloc[0]['y_test']
# Cost-complexity pruning analysis for the Gini tree on the original data.
# The candidate alphas were computed earlier and stored in the results table.
ccp_alphas = dFFeature_Subset_Result.iloc[0]['ccp_alphas']
# Fit one tree per candidate alpha.
# FIX: max_features changed from 'auto' (deprecated in scikit-learn 1.1,
# removed in 1.3) to its classifier-equivalent 'sqrt' - behaviour unchanged.
clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(random_state=0, class_weight='balanced', max_features='sqrt', ccp_alpha=ccp_alpha)
    clf.fit(X_train, y_train)
    clfs.append(clf)
# Remove the last element in clfs and ccp_alphas: it is the trivial
# single-node tree.
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]
# Visualise how tree size and depth shrink as alpha grows.
node_counts = [clf.tree_.node_count for clf in clfs]
depth = [clf.tree_.max_depth for clf in clfs]
plt.scatter(ccp_alphas,node_counts)
plt.scatter(ccp_alphas,depth)
plt.plot(ccp_alphas,node_counts,label='no of nodes',drawstyle="steps-post")
plt.plot(ccp_alphas,depth,label='depth',drawstyle="steps-post")
plt.legend()
plt.show()
# Train/test accuracy for every pruned tree.
train_acc = []
test_acc = []
for c in clfs:
    y_train_pred = c.predict(X_train)
    y_test_pred = c.predict(X_test)
    train_acc.append(accuracy_score(y_train_pred,y_train))
    test_acc.append(accuracy_score(y_test_pred,y_test))
plt.scatter(ccp_alphas,train_acc)
plt.scatter(ccp_alphas,test_acc)
plt.plot(ccp_alphas,train_acc,label='train_accuracy',drawstyle="steps-post")
plt.plot(ccp_alphas,test_acc,label='test_accuracy',drawstyle="steps-post")
plt.legend()
plt.title('Accuracy vs alpha')
plt.show()
# Tabulate, sorted by the train/test accuracy gap (smaller gap = less overfit).
df = pd.DataFrame({"ccp_alphas": ccp_alphas, "Train Acc.": train_acc, "Test Acc.": test_acc, "No. of Nodes": node_counts, "Depth": depth, "Accuracy Diff.": map(sub, train_acc, test_acc)})
print("Print Sorted Dataframe on Accuracy Difference between Train Set Accuracy and Test Set Accuracy\n")
print(df.sort_values(by=['Accuracy Diff.']))
# Refit at the chosen alpha via grid search with the Gini criterion.
parameters = {'criterion': ['gini'],
              'max_features': ['sqrt'],
              'random_state': [0],
              'class_weight': ['balanced'],
              'ccp_alpha': [1.481786e-02]}
des_tree_gini_PostPruning = DecisionTreeClassifier()
des_tree_gini_grid_PostPruning = GridSearchCV(des_tree_gini_PostPruning, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_gini_grid_results_PostPruning = perform_model(des_tree_gini_grid_PostPruning, X_train, y_train, X_test, y_test, labels)
# observe the attributes of the model
print_grid_search_attributes(des_tree_gini_grid_results_PostPruning)
#X_train = dFFeature_Subset_Result.iloc[1]['X_train']
#y_train = dFFeature_Subset_Result.iloc[1]['y_train']
#X_test = dFFeature_Subset_Result.iloc[1]['X_test']
#y_test = dFFeature_Subset_Result.iloc[1]['y_test']
# Cost-complexity pruning analysis for the Gini tree on the SMOTE data.
ccp_alphas = dFFeature_Subset_Result.iloc[1]['ccp_alphas']
# Fit one tree per candidate alpha.
# FIX: max_features changed from 'auto' (deprecated in scikit-learn 1.1,
# removed in 1.3) to its classifier-equivalent 'sqrt' - behaviour unchanged.
clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(random_state=0, class_weight='balanced', max_features='sqrt', ccp_alpha=ccp_alpha)
    clf.fit(X_train_smote, y_train_smote)
    clfs.append(clf)
# Remove the last element in clfs and ccp_alphas: it is the trivial
# single-node tree.
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]
# Visualise how tree size and depth shrink as alpha grows.
node_counts = [clf.tree_.node_count for clf in clfs]
depth = [clf.tree_.max_depth for clf in clfs]
plt.scatter(ccp_alphas,node_counts)
plt.scatter(ccp_alphas,depth)
plt.plot(ccp_alphas,node_counts,label='no of nodes',drawstyle="steps-post")
plt.plot(ccp_alphas,depth,label='depth',drawstyle="steps-post")
plt.legend()
plt.show()
# Train/test accuracy for every pruned tree.
train_acc = []
test_acc = []
for c in clfs:
    y_train_pred = c.predict(X_train_smote)
    y_test_pred = c.predict(X_test)
    train_acc.append(accuracy_score(y_train_pred,y_train_smote))
    test_acc.append(accuracy_score(y_test_pred,y_test))
plt.scatter(ccp_alphas,train_acc)
plt.scatter(ccp_alphas,test_acc)
plt.plot(ccp_alphas,train_acc,label='train_accuracy',drawstyle="steps-post")
plt.plot(ccp_alphas,test_acc,label='test_accuracy',drawstyle="steps-post")
plt.legend()
plt.title('Accuracy vs alpha')
plt.show()
# Tabulate, sorted by the train/test accuracy gap (smaller gap = less overfit).
df = pd.DataFrame({"ccp_alphas": ccp_alphas, "Train Acc.": train_acc, "Test Acc.": test_acc, "No. of Nodes": node_counts, "Depth": depth, "Accuracy Diff.": map(sub, train_acc, test_acc)})
print("Print Sorted Dataframe on Accuracy Difference between Train Set Accuracy and Test Set Accuracy\n")
print(df.sort_values(by=['Accuracy Diff.']))
# Refit at the chosen alpha via grid search with the Gini criterion.
parameters = {'criterion': ['gini'],
              'max_features': ['sqrt'],
              'random_state': [0],
              'class_weight': ['balanced'],
              'ccp_alpha': [0.014468]}
des_tree_gini_PostPruning = DecisionTreeClassifier()
des_tree_gini_grid_PostPruning = GridSearchCV(des_tree_gini_PostPruning, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_gini_grid_results_PostPruning = perform_model(des_tree_gini_grid_PostPruning, X_train_smote, y_train_smote, X_test, y_test, labels)
# observe the attributes of the model
print_grid_search_attributes(des_tree_gini_grid_results_PostPruning)
#X_train = dFFeature_Subset_Result.iloc[4]['X_train']
#y_train = dFFeature_Subset_Result.iloc[4]['y_train']
#X_test = dFFeature_Subset_Result.iloc[4]['X_test']
#y_test = dFFeature_Subset_Result.iloc[4]['y_test']
# Cost-complexity pruning analysis for the Entropy tree on the original data.
ccp_alphas = dFFeature_Subset_Result.iloc[4]['ccp_alphas']
# Fit one tree per candidate alpha.
# FIX: max_features changed from 'auto' (deprecated in scikit-learn 1.1,
# removed in 1.3) to its classifier-equivalent 'sqrt' - behaviour unchanged.
clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(criterion='entropy', random_state=0, class_weight='balanced', max_features='sqrt', ccp_alpha=ccp_alpha)
    clf.fit(X_train, y_train)
    clfs.append(clf)
# Remove the last element in clfs and ccp_alphas: it is the trivial
# single-node tree.
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]
# Visualise how tree size and depth shrink as alpha grows.
node_counts = [clf.tree_.node_count for clf in clfs]
depth = [clf.tree_.max_depth for clf in clfs]
plt.scatter(ccp_alphas,node_counts)
plt.scatter(ccp_alphas,depth)
plt.plot(ccp_alphas,node_counts,label='no of nodes',drawstyle="steps-post")
plt.plot(ccp_alphas,depth,label='depth',drawstyle="steps-post")
plt.legend()
plt.show()
# Train/test accuracy for every pruned tree.
train_acc = []
test_acc = []
for c in clfs:
    y_train_pred = c.predict(X_train)
    y_test_pred = c.predict(X_test)
    train_acc.append(accuracy_score(y_train_pred,y_train))
    test_acc.append(accuracy_score(y_test_pred,y_test))
plt.scatter(ccp_alphas,train_acc)
plt.scatter(ccp_alphas,test_acc)
plt.plot(ccp_alphas,train_acc,label='train_accuracy',drawstyle="steps-post")
plt.plot(ccp_alphas,test_acc,label='test_accuracy',drawstyle="steps-post")
plt.legend()
plt.title('Accuracy vs alpha')
plt.show()
# Tabulate, sorted by the train/test accuracy gap (smaller gap = less overfit).
df = pd.DataFrame({"ccp_alphas": ccp_alphas, "Train Acc.": train_acc, "Test Acc.": test_acc, "No. of Nodes": node_counts, "Depth": depth, "Accuracy Diff.": map(sub, train_acc, test_acc)})
print("Print Sorted Dataframe on Accuracy Difference between Train Set Accuracy and Test Set Accuracy\n")
print(df.sort_values(by=['Accuracy Diff.']))
# Refit at the chosen alpha via grid search with the Entropy criterion.
parameters = {'criterion': ['entropy'],
              'max_features': ['sqrt'],
              'random_state': [0],
              'class_weight': ['balanced'],
              'ccp_alpha': [2.954752e-02]}
des_tree_entropy_PostPruning = DecisionTreeClassifier()
des_tree_entropy_grid_PostPruning = GridSearchCV(des_tree_entropy_PostPruning, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_entropy_grid_results_PostPruning = perform_model(des_tree_entropy_grid_PostPruning, X_train, y_train, X_test, y_test, labels)
# observe the attributes of the model
print_grid_search_attributes(des_tree_entropy_grid_results_PostPruning)
#X_train = dFFeature_Subset_Result.iloc[5]['X_train']
#y_train = dFFeature_Subset_Result.iloc[5]['y_train']
#X_test = dFFeature_Subset_Result.iloc[5]['X_test']
#y_test = dFFeature_Subset_Result.iloc[5]['y_test']
# Cost-complexity pruning analysis for the Entropy tree on the SMOTE data.
ccp_alphas = dFFeature_Subset_Result.iloc[5]['ccp_alphas']
# Fit one tree per candidate alpha.
# FIX: max_features changed from 'auto' (deprecated in scikit-learn 1.1,
# removed in 1.3) to its classifier-equivalent 'sqrt' - behaviour unchanged.
clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(criterion='entropy', random_state=0, class_weight='balanced', max_features='sqrt', ccp_alpha=ccp_alpha)
    clf.fit(X_train_smote, y_train_smote)
    clfs.append(clf)
# Remove the last element in clfs and ccp_alphas: it is the trivial
# single-node tree.
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]
# Visualise how tree size and depth shrink as alpha grows.
node_counts = [clf.tree_.node_count for clf in clfs]
depth = [clf.tree_.max_depth for clf in clfs]
plt.scatter(ccp_alphas,node_counts)
plt.scatter(ccp_alphas,depth)
plt.plot(ccp_alphas,node_counts,label='no of nodes',drawstyle="steps-post")
plt.plot(ccp_alphas,depth,label='depth',drawstyle="steps-post")
plt.legend()
plt.show()
# Train/test accuracy for every pruned tree.
train_acc = []
test_acc = []
for c in clfs:
    y_train_pred = c.predict(X_train_smote)
    y_test_pred = c.predict(X_test)
    train_acc.append(accuracy_score(y_train_pred,y_train_smote))
    test_acc.append(accuracy_score(y_test_pred,y_test))
plt.scatter(ccp_alphas,train_acc)
plt.scatter(ccp_alphas,test_acc)
plt.plot(ccp_alphas,train_acc,label='train_accuracy',drawstyle="steps-post")
plt.plot(ccp_alphas,test_acc,label='test_accuracy',drawstyle="steps-post")
plt.legend()
plt.title('Accuracy vs alpha')
plt.show()
# Tabulate, sorted by the train/test accuracy gap (smaller gap = less overfit).
df = pd.DataFrame({"ccp_alphas": ccp_alphas, "Train Acc.": train_acc, "Test Acc.": test_acc, "No. of Nodes": node_counts, "Depth": depth, "Accuracy Diff.": map(sub, train_acc, test_acc)})
print("Print Sorted Dataframe on Accuracy Difference between Train Set Accuracy and Test Set Accuracy\n")
print(df.sort_values(by=['Accuracy Diff.']))
# Refit at the chosen alpha via grid search with the Entropy criterion.
parameters = {'criterion': ['entropy'],
              'max_features': ['sqrt'],
              'random_state': [0],
              'class_weight': ['balanced'],
              'ccp_alpha': [0.032157]}
des_tree_entropy_PostPruning = DecisionTreeClassifier()
des_tree_entropy_grid_PostPruning = GridSearchCV(des_tree_entropy_PostPruning, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_entropy_grid_results_PostPruning = perform_model(des_tree_entropy_grid_PostPruning, X_train_smote, y_train_smote, X_test, y_test, labels)
# observe the attributes of the model
print_grid_search_attributes(des_tree_entropy_grid_results_PostPruning)
#X_train = dFFeature_Subset_Result.iloc[6]['X_train']
#y_train = dFFeature_Subset_Result.iloc[6]['y_train']
#X_test = dFFeature_Subset_Result.iloc[6]['X_test']
#y_test = dFFeature_Subset_Result.iloc[6]['y_test']
# Cost-complexity pruning analysis for the Log_Loss tree on the original data.
ccp_alphas = dFFeature_Subset_Result.iloc[6]['ccp_alphas']
# Fit one tree per candidate alpha.
# FIX: max_features changed from 'auto' (deprecated in scikit-learn 1.1,
# removed in 1.3) to its classifier-equivalent 'sqrt' - behaviour unchanged.
clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(criterion='log_loss', random_state=0, class_weight='balanced', max_features='sqrt', ccp_alpha=ccp_alpha)
    clf.fit(X_train, y_train)
    clfs.append(clf)
# Remove the last element in clfs and ccp_alphas: it is the trivial
# single-node tree.
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]
# Visualise how tree size and depth shrink as alpha grows.
node_counts = [clf.tree_.node_count for clf in clfs]
depth = [clf.tree_.max_depth for clf in clfs]
plt.scatter(ccp_alphas,node_counts)
plt.scatter(ccp_alphas,depth)
plt.plot(ccp_alphas,node_counts,label='no of nodes',drawstyle="steps-post")
plt.plot(ccp_alphas,depth,label='depth',drawstyle="steps-post")
plt.legend()
plt.show()
# Train/test accuracy for every pruned tree.
train_acc = []
test_acc = []
for c in clfs:
    y_train_pred = c.predict(X_train)
    y_test_pred = c.predict(X_test)
    train_acc.append(accuracy_score(y_train_pred,y_train))
    test_acc.append(accuracy_score(y_test_pred,y_test))
plt.scatter(ccp_alphas,train_acc)
plt.scatter(ccp_alphas,test_acc)
plt.plot(ccp_alphas,train_acc,label='train_accuracy',drawstyle="steps-post")
plt.plot(ccp_alphas,test_acc,label='test_accuracy',drawstyle="steps-post")
plt.legend()
plt.title('Accuracy vs alpha')
plt.show()
# Tabulate, sorted by the train/test accuracy gap (smaller gap = less overfit).
df = pd.DataFrame({"ccp_alphas": ccp_alphas, "Train Acc.": train_acc, "Test Acc.": test_acc, "No. of Nodes": node_counts, "Depth": depth, "Accuracy Diff.": map(sub, train_acc, test_acc)})
print("Print Sorted Dataframe on Accuracy Difference between Train Set Accuracy and Test Set Accuracy\n")
print(df.sort_values(by=['Accuracy Diff.']))
# Refit at the chosen alpha via grid search with the Log_Loss criterion.
parameters = {'criterion': ['log_loss'],
              'max_features': ['sqrt'],
              'random_state': [0],
              'class_weight': ['balanced'],
              'ccp_alpha': [2.954752e-02]}
des_tree_logloss = DecisionTreeClassifier()
des_tree_logloss_grid = GridSearchCV(des_tree_logloss, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_logloss_grid_results = perform_model(des_tree_logloss_grid, X_train, y_train, X_test, y_test, labels)
# observe the attributes of the model
print_grid_search_attributes(des_tree_logloss_grid_results)
#X_train = dFFeature_Subset_Result.iloc[7]['X_train']
#y_train = dFFeature_Subset_Result.iloc[7]['y_train']
#X_test = dFFeature_Subset_Result.iloc[7]['X_test']
#y_test = dFFeature_Subset_Result.iloc[7]['y_test']
# Cost-complexity pruning analysis for the Log_Loss tree on the SMOTE data.
ccp_alphas = dFFeature_Subset_Result.iloc[7]['ccp_alphas']
# Fit one tree per candidate alpha.
# FIX: max_features changed from 'auto' (deprecated in scikit-learn 1.1,
# removed in 1.3) to its classifier-equivalent 'sqrt' - behaviour unchanged.
clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(criterion='log_loss', random_state=0, class_weight='balanced', max_features='sqrt', ccp_alpha=ccp_alpha)
    clf.fit(X_train_smote, y_train_smote)
    clfs.append(clf)
# Remove the last element in clfs and ccp_alphas: it is the trivial
# single-node tree.
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]
# Visualise how tree size and depth shrink as alpha grows.
node_counts = [clf.tree_.node_count for clf in clfs]
depth = [clf.tree_.max_depth for clf in clfs]
plt.scatter(ccp_alphas,node_counts)
plt.scatter(ccp_alphas,depth)
plt.plot(ccp_alphas,node_counts,label='no of nodes',drawstyle="steps-post")
plt.plot(ccp_alphas,depth,label='depth',drawstyle="steps-post")
plt.legend()
plt.show()
# Train/test accuracy for every pruned tree.
train_acc = []
test_acc = []
for c in clfs:
    y_train_pred = c.predict(X_train_smote)
    y_test_pred = c.predict(X_test)
    train_acc.append(accuracy_score(y_train_pred,y_train_smote))
    test_acc.append(accuracy_score(y_test_pred,y_test))
plt.scatter(ccp_alphas,train_acc)
plt.scatter(ccp_alphas,test_acc)
plt.plot(ccp_alphas,train_acc,label='train_accuracy',drawstyle="steps-post")
plt.plot(ccp_alphas,test_acc,label='test_accuracy',drawstyle="steps-post")
plt.legend()
plt.title('Accuracy vs alpha')
plt.show()
# Tabulate, sorted by the train/test accuracy gap (smaller gap = less overfit).
df = pd.DataFrame({"ccp_alphas": ccp_alphas, "Train Acc.": train_acc, "Test Acc.": test_acc, "No. of Nodes": node_counts, "Depth": depth, "Accuracy Diff.": map(sub, train_acc, test_acc)})
print("Print Sorted Dataframe on Accuracy Difference between Train Set Accuracy and Test Set Accuracy\n")
print(df.sort_values(by=['Accuracy Diff.']))
# Refit at the chosen alpha via grid search with the Log_Loss criterion.
parameters = {'criterion': ['log_loss'],
              'max_features': ['sqrt'],
              'random_state': [0],
              'class_weight': ['balanced'],
              'ccp_alpha': [0.032157]}
des_tree_logloss = DecisionTreeClassifier()
des_tree_logloss_grid = GridSearchCV(des_tree_logloss, param_grid=parameters, cv=3, verbose=1, n_jobs=-1)
des_tree_logloss_grid_results = perform_model(des_tree_logloss_grid, X_train_smote, y_train_smote, X_test, y_test, labels)
# observe the attributes of the model
print_grid_search_attributes(des_tree_logloss_grid_results)
# Clear the summary table, re-append every collected result tuple, and print
# the key comparison columns as a markdown table.
dFFeature_Subset_Result.drop(dFFeature_Subset_Result.index, inplace=True)
for row in Feature_Subset_Result:
    dFFeature_Subset_Result.loc[len(dFFeature_Subset_Result)] = row
summary_cols = ['Best Estimator','Training Set Accuracy', 'Testing Set Accuracy', 'Training Time', 'Testing Time',
                'Avg. Cross Validation Score of Best Estimator']
print(dFFeature_Subset_Result[summary_cols].to_markdown(tablefmt="pretty"))
Gini Gain vs Information Gain vs Gain Ratio
They are all attribute-selection measures in decision trees.
Gini Gain forces the resulting tree to be binary.
Information Gain allows multiway splits.
Gain Ratio also allows multiway splits, but normalizes Information Gain by the split's intrinsic information, reducing the bias towards attributes with many values.
#Print the details of the model with the best scores (row 16 of the summary
#table). NOTE(review): the index 16 is hard-coded - verify it still points at
#the best row if the result list changes.
print(dFFeature_Subset_Result.iloc[16].to_markdown(tablefmt="pretty"))